"""
!pip install wheel
!pip install pipwin
!pipwin install numpy
!pipwin install pandas
!pipwin install shapely
!pipwin install gdal
!pipwin install fiona
!pipwin install pyproj
!pipwin install six
!pipwin install rtree
!pipwin install geopandas
!pip install geocoder
!pip3 install folium
!pip3 install beautifulsoup4
!pip3 install seaborn
!pip install missingno
"""
import pandas as pd
from pandas.api.types import CategoricalDtype
import requests
import geocoder
import folium
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
import seaborn as sns
import missingno as msno
import numpy as np
from pylab import rcParams
import geopandas as gpd
# Load one crime-incident CSV per year plus the district lookup table.
data_2015 = pd.read_csv('./data/crime-incident-reports-2015.csv')
data_2016 = pd.read_csv('./data/crime-incident-reports-2016.csv')
data_2017 = pd.read_csv('./data/crime-incident-reports-2017.csv')
data_2018 = pd.read_csv('./data/crime-incident-reports-2018.csv')
district = pd.read_csv('./data/district.csv')

# Report the (rows, columns) shape of every yearly frame.
yearly_frames = [data_2015, data_2016, data_2017, data_2018]
for label, frame in zip(('2015: ', '2016: ', '2017: ', '2018: '), yearly_frames):
    print(label, frame.shape)

# Total number of incident rows across all years.
total_row = sum(frame.shape[0] for frame in yearly_frames)
print(total_row)

# Stack the yearly frames into one table.
# NOTE: the per-file index is kept as-is (no ``ignore_index``), so index
# labels are not unique in ``data``.
data = pd.concat(yearly_frames)
data.shape
data.head(5)
# Index the lookup table by district code, then pull the DISTRICT_NAME
# column out as a plain {district code: district name} dictionary.
district = district.set_index("DISTRICT")
dict_district = district['DISTRICT_NAME'].to_dict()
dict_district
# Keep only rows that carry a usable district code: discard rows whose
# DISTRICT is 'External' and rows whose DISTRICT is missing.
#
# A boolean mask is used instead of ``data.drop(<index labels>)`` because
# ``data`` is a ``pd.concat`` of several frames that each keep their own
# index, so one label can refer to several rows. Dropping by label would
# then also remove unrelated rows that merely share a label with an
# 'External' row.
keep_mask = data['DISTRICT'].notna() & (data['DISTRICT'] != 'External')
data = data[keep_mask]
data.shape
data['DISTRICT'].unique()
# Translate every district code into its district name.
#
# ``Series.map`` performs one dictionary lookup per row. It always yields
# exactly one value per row (NaN for a code that is absent from
# ``dict_district``), so the result can never end up shorter than ``data``.
# The values are taken as a plain list so the assignment below is purely
# positional and does not depend on index alignment.
district_name = data['DISTRICT'].map(dict_district).tolist()
data['District_name'] = district_name
data.head()

# Plot the missing-value matrix of the whole table.
msno.matrix(data)
plt.show()
# Most cells in the SHOOTING column are NaN, so this column is dropped.
# Remove the SHOOTING column from the table.
data = data.drop('SHOOTING', axis=1)
data.head()

# Keep only the rows where both coordinates are present.
has_coordinates = data['Lat'].notna() & data['Long'].notna()
data = data[has_coordinates]

# Re-plot the missing-value matrix after the clean-up.
msno.matrix(data)
plt.show()
# Ordered weekday dtype (Monday first) so DAY_OF_WEEK sorts in calendar order.
weekday_dtype = CategoricalDtype(
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday'],
    ordered=True,
)

# Parse the timestamp column into datetimes.
data['OCCURRED_ON_DATE'] = pd.to_datetime(data['OCCURRED_ON_DATE'])
data["DAY_OF_WEEK"] = data["DAY_OF_WEEK"].astype(weekday_dtype)
def create_features(df):
    """Add calendar features derived from ``OCCURRED_ON_DATE`` to *df*.

    The columns ``dayofweek``, ``quarter``, ``dayofyear``, ``dayofmonth``
    and ``weekofyear`` are written into *df* in place.

    Args:
        df: DataFrame with a datetime-typed ``OCCURRED_ON_DATE`` column
            (the ``.dt`` accessor is used on it).

    Returns:
        A DataFrame holding only the five feature columns, in the order
        listed above.
    """
    occurred = df['OCCURRED_ON_DATE'].dt
    df['dayofweek'] = occurred.dayofweek
    df['quarter'] = occurred.quarter
    df['dayofyear'] = occurred.dayofyear
    df['dayofmonth'] = occurred.day
    # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in
    # pandas 2.0; ``isocalendar().week`` is the documented replacement. It
    # yields the ISO week number with the nullable ``UInt32`` dtype.
    df['weekofyear'] = occurred.isocalendar().week

    feature_columns = ['dayofweek', 'quarter', 'dayofyear',
                       'dayofmonth', 'weekofyear']
    return df[feature_columns]
# Add the calendar feature columns to ``data`` and preview them.
create_features(data).head()

# Convert four of the new feature columns to the categorical dtype
# (``weekofyear`` is left as it is).
for feature in ('quarter', 'dayofweek', 'dayofyear', 'dayofmonth'):
    data[feature] = data[feature].astype(CategoricalDtype())
data.head()
# Mapping from the raw upper-case column names to shorter display names.
rename = dict(
    OFFENSE_CODE_GROUP='Group',
    OFFENSE_DESCRIPTION='Description',
    DISTRICT='District',
    STREET='Street',
    OCCURRED_ON_DATE='Date',
    YEAR='Year',
    MONTH='Month',
    DAY_OF_WEEK='Day',
    HOUR='Hour',
)
# ``index=str`` converts every index label to a string while renaming.
data = data.rename(index=str, columns=rename)

# Select the columns to keep and fix their order.
column_order = [
    'INCIDENT_NUMBER', 'OFFENSE_CODE', 'Group', 'Description',
    'Date', 'Year', 'Month', 'Day', 'Hour',
    'dayofweek', 'quarter', 'dayofyear', 'dayofmonth', 'weekofyear',
    'District', 'District_name', 'REPORTING_AREA', 'UCR_PART',
    'Street', 'Lat', 'Long', 'Location',
]
data = data[column_order]
data.head()
data.describe().T
# Count incidents for every (District, UCR_PART) pair.
dis = data.groupby(by=["District", "UCR_PART"]).size()
# Flatten the grouped counts into a three-column frame.
s = dis.reset_index(name="Crime Counts")

# Grouped bar chart: one bar cluster per district, one bar per UCR part.
ax = sns.barplot(data=s, x="District", y="Crime Counts", hue="UCR_PART")
plt.legend(title='UCR_PART', bbox_to_anchor=(1, 1))
# Global plot styling: default figure size and seaborn font scale.
rcParams["figure.figsize"] = (18, 7)
sns.set(font_scale=1.75)

# Count plot of the five most frequent offense groups, split by district.
order = data['Group'].value_counts().index[:5]
sns.countplot(x='Group', hue='District', data=data, order=order,
              saturation=2, linewidth=1)

# Heat map of incident counts per (dayofweek, Hour) cell.
incidents_by_day_hour = pd.pivot_table(
    data=data,
    index="dayofweek",
    columns="Hour",
    values="INCIDENT_NUMBER",
    aggfunc='count',
)
sns.heatmap(incidents_by_day_hour, cmap='Reds')

# Box plot of the per-(Month, District) non-null counts of the Group column,
# grouped by month.
grouped = data.groupby(['Month', 'District']).count()
sns.boxplot(x="Month", y="Group", data=grouped.reset_index(), palette="ch:.102")
# Geocode every distinct district name with the ArcGIS provider and collect
# the coordinates in ``district_coor``.
district = data['District_name'].unique()
latitude = []
longitude = []
coor = []
for district_label in district:
    g = geocoder.arcgis('{} ,Boston'.format(district_label))
    lat_lng_coords = g.latlng
    # Without usable coordinates the indexing below would fail with an
    # opaque error; report which district could not be geocoded instead.
    if not lat_lng_coords:
        raise ValueError(
            'Geocoding returned no coordinates for district '
            '{!r}'.format(district_label)
        )
    coor.append(lat_lng_coords)
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

# One row per district: name, [lat, lng] pair, and the two values separately.
district_coor = pd.DataFrame({
    'district': district,
    'coor': coor,
    'latitude': latitude,
    'longitude': longitude,
})
district_coor.head(15)

# Base map centred on the first geocoded district.
heatmap = folium.Map(location=coor[0], zoom_start=12)
heatmap
# Hard-coded replacement coordinates, keyed by the row label in
# ``district_coor``.
# NOTE(review): presumably these correct inaccurate geocoder results, and
# the row labels depend on the order of the district rows - confirm.
manual_coordinates = {
    1: (42.293066, -71.071760),
    7: (42.370918, -71.039203),
    8: (42.280873, -71.162792),
    9: (42.337805, -71.049307),
    11: (42.378547, -71.061281),
}
for row_label, (fixed_lat, fixed_lng) in manual_coordinates.items():
    district_coor.at[row_label, 'coor'] = [fixed_lat, fixed_lng]
    district_coor.at[row_label, 'latitude'] = fixed_lat
    district_coor.at[row_label, 'longitude'] = fixed_lng
district_coor
# Read the zoning-district polygons from the GeoJSON file.
gpf = gpd.read_file("./data/Zoning_Districts.geojson")
gpf

# Outline-only styling: fully transparent fill, semi-opaque black border.
style = {'fillColor': '#00000000', 'color': '#000000', 'weight': 2, 'opacity': 0.5}


def _outline_style(_feature):
    """Return the shared outline style for every GeoJSON feature."""
    return style


# Draw the polygons on top of the base map.
folium.GeoJson(data=gpf['geometry'], style_function=_outline_style).add_to(heatmap)
heatmap
# Put one circle marker per district on the map; the popup shows the
# district name.
for district_label, point in zip(district_coor['district'], district_coor['coor']):
    marker = folium.CircleMarker(
        location=point,
        radius=5,
        popup=district_label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(heatmap)
heatmap
from folium import plugins

# Incident coordinates as an array of [Lat, Long] rows.
heat = data[['Lat', 'Long']].values

# Overlay the incident density as a heat-map layer.
# ``add_child`` is used because ``add_children`` is only a deprecated alias
# of it and is not available in current folium/branca releases.
heatmap.add_child(plugins.HeatMap(heat, radius=13))
heatmap